In [1]:
import pandas as pd
import numpy as np
import pylab as pl
%matplotlib inline
pl.rcParams['font.family'] = 'serif'
In [2]:
path = '../data/stem-cell/'
file_names=['NG_norm_concat.csv','NN_norm_concat.csv','4FI_norm_concat.csv']
Okay, so these look like just the actual timestamps of when the samples went through the machine, not the barcode.
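Since the day number always sits right after the underscore, a regex would pull it out regardless of the prefix ('NG', 'NN', or '4FI'). This is just a sketch with a made-up helper name (day_from_filename); the cells below stick with fixed-position slicing.

import re

def day_from_filename(fname):
    # e.g. 'NG_04.export.csv' -> 4, '4FI_16.export.csv' -> 16
    return int(re.search(r'_(\d+)\.', fname).group(1))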
In [3]:
NN_filenames = ['NN_00.export.csv',
'NN_02.export.csv',
'NN_04.export.csv',
'NN_06.export.csv',
'NN_08.export.csv',
'NN_10.export.csv',
'NN_12.export.csv',
'NN_14.export.csv',
'NN_16.export.csv',
'NN_18.export.csv',
'NN_20.export.csv',
'NN_22.export.csv',
'NN_24.export.csv',
'NN_30.export.csv']
In [4]:
NG_filenames = ['NG_00.export.csv',
'NG_02.export.csv',
'NG_04.export.csv',
'NG_06.export.csv',
'NG_08.export.csv',
'NG_10.export.csv',
'NG_12.export.csv',
'NG_14.export.csv',
'NG_16.export.csv',
'NG_18.export.csv',
'NG_20.export.csv',
'NG_22.export.csv',
'NG_24.export.csv',
'NG_30.export.csv']
In [5]:
Oct4_filenames = [
'4FI_00.export.csv',
'4FI_01.export.csv',
'4FI_02.export.csv',
'4FI_04.export.csv',
'4FI_06.export.csv',
'4FI_08.export.csv',
'4FI_10.export.csv',
'4FI_12.export.csv',
'4FI_14.export.csv',
'4FI_16.export.csv',
'4FI_17.export.csv',
'4FI_18.export.csv',
'4FI_20.export.csv']
In [6]:
times = [int(s[3:5]) for s in NG_filenames]
print(times)
pl.plot(times)
Out[6]:
In [11]:
times_nn = [int(s[3:5]) for s in NN_filenames]
In [7]:
times_oct = [int(s[4:6]) for s in Oct4_filenames]
print(times_oct)
pl.plot(times_oct)
Out[7]:
In [143]:
NG = [pd.read_csv(path+'NG/'+fname) for fname in NG_filenames]
NN = [pd.read_csv(path+'NN/'+fname) for fname in NN_filenames]
Oct4 = [pd.read_csv(path+'4FI/'+fname) for fname in Oct4_filenames]
In [144]:
NG[0].columns
Out[144]:
In [145]:
NG[0].columns[1:-3]
Out[145]:
In [146]:
pl.plot(times,[len(n) for n in NG],label='Nanog-GFP')
pl.plot(times,[len(n) for n in NN],label='Nanog-Neo')
pl.plot(times_oct,[len(n) for n in Oct4],label='Oct4-GFP')
pl.legend(loc='best')
pl.ylabel('Number of cells')
pl.xlabel('Day')
pl.title('Cells per day')
Out[146]:
In [147]:
max([len(n) for n in NG]) / min([len(n) for n in NG]),max([len(n) for n in NN]) / min([len(n) for n in NN])
Out[147]:
In [148]:
max([len(n) for n in Oct4]) / min([len(n) for n in Oct4])
Out[148]:
In [574]:
# construct a numpy array containing all of the NG_samples
all_data = np.vstack(NG)  # pass the list itself; newer numpy rejects generators here
data = all_data[:,2:-3]
print(data.shape)
In [592]:
data_p = np.arcsinh(data*15)  # arcsinh transform: ~linear near zero, ~logarithmic for large values; 15 sets the scale
In [593]:
from time import time
from scipy.cluster import hierarchy
from scipy.spatial import distance
from sklearn.manifold import MDS

def embedding(X, labels=None, linkage='single', profile=True):
    ''' (1) compute a hierarchical clustering (linkage matrix),
    (2) compute pairwise cophenetic distances from the linkage matrix,
    (3) compute a 2D MDS embedding from those distances'''
    t = time()
    n = len(X)
    c = hierarchy.linkage(X, linkage)
    d = distance.squareform(hierarchy.cophenet(c))
    t1 = time()
    print('Computed cophenetic distances in {0:.2f}s'.format(t1 - t))
    mds = MDS(dissimilarity='precomputed')
    X_ = mds.fit_transform(d[:n, :n])
    t2 = time()
    print('Computed MDS embedding in {0:.2f}s'.format(t2 - t1))
    if labels is None:
        pl.scatter(X_[:, 0], X_[:, 1])
    else:
        pl.scatter(X_[:, 0], X_[:, 1], c=labels, linewidths=0)
    return X_, c
In [594]:
np.random.seed(0)
sample_ind = np.random.rand(len(data_p)) < 0.005
sum(sample_ind)
Out[594]:
In [595]:
sample = data_p[sample_ind]
results = embedding(sample)
In [596]:
t = day[sample_ind]
X_ = results[0]
pl.scatter(X_[:,0],X_[:,1],c=t,
linewidths=0)
pl.colorbar()
Out[596]:
In [591]:
import seaborn
seaborn.kdeplot(data[:,10])
pl.figure()
seaborn.kdeplot(np.log(data[:,10] - np.min(data[:,10])+1))
pl.figure()
seaborn.kdeplot(np.arcsinh(data[:,10]*15))
Out[591]:
In [531]:
seaborn.kdeplot(np.log(data[:,10:12] - np.min(data[:,10:12])+1))
Out[531]:
In [570]:
signed_log_data = np.log(abs(data[:,10]))*(2*(data[:,10]>0)-1)
seaborn.kdeplot(signed_log_data)
Out[570]:
In [552]:
lim=5
subset = abs(data[:,10])<lim
seaborn.kdeplot(data[:,10][subset])
pl.xlim(-lim,lim)
sum(subset)
Out[552]:
In [571]:
def signed_log(data):
    return np.log(abs(data))*(2*(data>=0)-1)
In [573]:
x = np.arange(-100,100,0.0001)
y = signed_log(x)
pl.plot(x,y)
#pl.xlim(-1,1)
pl.figure()
pl.plot(x,np.arcsinh(x))
Out[573]:
In [564]:
x = np.arange(-100,100,0.0001)
y = np.log(x)
pl.plot(x,y)
pl.xlim(0,2)
Out[564]:
In [402]:
pl.hist(data[:,10],bins=100);
pl.figure()
pl.hist(data[:,10],bins=100,log=True);
In [151]:
processed = 1/ (1+np.exp(-data*10))
pl.hist(processed[:,10],bins=50);
In [152]:
processed = np.log(data - (np.min(data))+1);
pl.hist(processed[:,10],bins=50,normed=True,log=True);
#pl.xscale('log')
Out[152]:
In [453]:
#for i,cofactor in enumerate([0.001,0.01,0.1,0.2,0.3,0.4,0.5,1.0,2.0,5.0,10.0,100.0,1000.0]):
for i,cofactor in enumerate([10**c for c in np.arange(-1,2,0.05)]):
    processed = np.arcsinh(data*cofactor)
    #pl.hist(processed[:,10],bins=100);
    seaborn.kdeplot(processed[:,10],shade=True)
    pl.title('Cofactor: {0:.3f}'.format(cofactor))
    pl.ylim(0,0.75)
    pl.xlim(-10,15)
    pl.savefig('../figures/effect-of-normalization/{0}.jpg'.format(i))
    pl.close()
In [ ]:
In [430]:
processed = np.arcsinh(data*0.5)
seaborn.kdeplot(processed[:,30],shade=True)
Out[430]:
In [516]:
blah = np.random.randn(100000)
seaborn.kdeplot(blah,shade=True)
Out[516]:
In [517]:
#for i,cofactor in enumerate([0.001,0.01,0.1,0.2,0.3,0.4,0.5,1.0,2.0,5.0,10.0,100.0,1000.0]):
for i,cofactor in enumerate([10**c for c in np.arange(-1,2,0.05)[::-1]]):
    proc_blah = np.arcsinh(blah/cofactor)
    #pl.hist(processed[:,10],bins=100);
    seaborn.kdeplot(proc_blah,shade=True)
    pl.title('Cofactor: {0:.3f}'.format(cofactor))
    #pl.ylim(0,0.1+np.max(proc_blah))
    pl.ylim(0,4)
    pl.xlim(-7,7)
    pl.savefig('../figures/effect-of-normalization-on-normal/{0}.jpg'.format(i))
    pl.close()
In [513]:
cofactors = [1,5,10,20,50,100]
cofactor = cofactors[-1]
for cofactor in cofactors:
    pl.plot(np.arange(-100,100),np.arcsinh(np.arange(-100,100)/cofactor),label=cofactor)
#pl.plot(np.arange(-100,100),np.arcsinh(np.arange(-100,100)/cofactors[0]),label=cofactors[0])
#pl.plot(np.arange(-100,100),np.arcsinh(np.arange(-100,100)/cofactors[1]),label=cofactors[1])
#pl.plot(np.arange(-100,100),np.arcsinh(np.arange(-100,100)/cofactors[2]),label=cofactors[2])
#pl.ylabel(r'$\arcsin(x/\text{cofactor})$')
#pl.xlabel(r'$x$')
pl.xlabel('x')
pl.ylabel('arcsinh(x/cofactor)')
pl.title('Cofactors and linear region size')
pl.legend(loc='best')
Out[513]:
In [497]:
data = processed
In [378]:
data_nn = np.arcsinh(np.vstack(NN)[:,2:-3])
In [156]:
day = np.hstack([np.ones(len(n))*times[i] for i,n in enumerate(NG)])
day.shape
Out[156]:
In [157]:
day_nn = np.hstack([np.ones(len(n))*times[i] for i,n in enumerate(NN)])
day_nn.shape
Out[157]:
In [158]:
day_Oct4 = np.hstack([np.ones(len(n))*times_oct[i] for i,n in enumerate(Oct4)])
day_Oct4.shape
Out[158]:
In [159]:
Oct4_processed = np.arcsinh(np.vstack(Oct4)[:,2:-3])
In [160]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(data)
Out[160]:
In [161]:
pl.plot(pca.explained_variance_ratio_)
sum(pca.explained_variance_ratio_[:2])
Out[161]:
In [162]:
pca = PCA(2)
X_ = pca.fit_transform(data)
X_.shape
Out[162]:
In [163]:
pl.scatter(X_[:,0],X_[:,1],c=day,linewidths=0,alpha=0.5)
pl.colorbar()
Out[163]:
In [164]:
Y_ = pca.inverse_transform(X_)
Y_.shape
Out[164]:
In [165]:
np.random.seed(0)
sample_size=10000
sample_ind=np.arange(len(data))[np.random.rand(len(data))<(1.0*sample_size/len(data))]
sample = data[sample_ind]
sample.shape
Out[165]:
In [166]:
pca = PCA(2)
X_ = pca.fit_transform(sample)
X_.shape
Out[166]:
In [167]:
pl.plot(pca.explained_variance_ratio_)
sum(pca.explained_variance_ratio_[:2])
Out[167]:
In [168]:
X_ = X_[:,:2]
In [169]:
X_.shape,data.shape
Out[169]:
In [170]:
pl.scatter(X_[:,0],X_[:,1],c=day[sample_ind],linewidths=0,alpha=0.5)
pl.xlabel("PC1")
pl.ylabel("PC2")
pl.title('PCA embedding of 10k-point subsample')
pl.colorbar()
Out[170]:
In [34]:
Y_ = pca.inverse_transform(X_)
In [40]:
from sklearn.manifold import LocallyLinearEmbedding
lle = LocallyLinearEmbedding(n_neighbors=10)
lle.fit(sample)
Out[40]:
In [41]:
X_ = lle.embedding_
pl.scatter(X_[:,0],X_[:,1],c=day[sample_ind],linewidths=0,alpha=0.5)
pl.title('LLE embedding of 10k-point subsample')
pl.colorbar()
Out[41]:
In [37]:
from scipy.spatial import distance
pdist_orig = distance.pdist(sample)
pdist_reconstructed = distance.pdist(Y_)
In [38]:
diff = distance.squareform(pdist_orig) - distance.squareform(pdist_reconstructed)
In [39]:
np.sqrt(np.sum(diff**2))
Out[39]:
In [40]:
np.linalg.norm(diff)
Out[40]:
In [41]:
np.linalg.norm(diff) / len(sample)
Out[41]:
In [42]:
X_.shape,data.shape
Out[42]:
In [51]:
from sklearn import neighbors
def one_nn_baseline(X,Y):
    # fraction of points whose nearest (non-self) neighbor agrees between X and its embedding Y
    one_nn_X = neighbors.kneighbors_graph(X,2)
    one_nn_Y = neighbors.kneighbors_graph(Y,2)
    sames = 0
    for i in range(len(X)):
        neighbor_X = one_nn_X[i].indices[one_nn_X[i].indices!=i][0]
        neighbor_Y = one_nn_Y[i].indices[one_nn_Y[i].indices!=i][0]
        if neighbor_X == neighbor_Y:
            sames+=1
    return 1.0*sames / len(X)
In [45]:
one_nn_baseline(sample,X_)
Out[45]:
In [46]:
knn_X = neighbors.kneighbors_graph(X_,3)
knn_X[0].indices[knn_X[0].indices!=0]
Out[46]:
In [47]:
sum(knn_X[0].indices[knn_X[0].indices!=0]==knn_X[0].indices[knn_X[0].indices!=0])
Out[47]:
In [62]:
s = set(range(10))
s.intersection(range(5))
Out[62]:
In [52]:
def knn_baseline(X,Y,k=5):
    k = k+1 # since self is counted as a neighbor in the kneighbors graph
    knn_X = neighbors.kneighbors_graph(X,k)
    knn_Y = neighbors.kneighbors_graph(Y,k)
    sames = 0
    for i in range(len(X)):
        neighbors_X = set(knn_X[i].indices[knn_X[i].indices!=i])
        neighbors_Y = set(knn_Y[i].indices[knn_Y[i].indices!=i])
        sames += len(neighbors_X.intersection(neighbors_Y))
        #sames += sum(neighbors_X == neighbors_Y)
        #sames
    return 1.0*sames / (len(X)*(k-1))
In [77]:
knn_baseline(sample,X_,5)
Out[77]:
In [52]:
sample.shape,X_.shape
Out[52]:
In [50]:
pca = PCA(10)
X_ = pca.fit_transform(sample)
one_nn_baseline(sample,X_)
In [34]:
knn_baseline(sample,X_,5)
Out[34]:
In [45]:
from sklearn.cluster import DBSCAN
In [58]:
pl.hist(distance.pdist(sample),bins=50);
pl.figure()
pl.hist(distance.pdist(X_),bins=50);
From Wikipedia: a standard heuristic for choosing DBSCAN's eps is to compute each point's distance to its k-th nearest neighbor and look for an elbow in the distribution of those k-distances.
In [78]:
def k_dist(X,k=5):
    k = k+1 # since self is counted as a neighbor in the kneighbors graph
    knn = neighbors.kneighbors_graph(X,k,mode='distance')
    dist = np.zeros(len(X))
    for i in range(len(X)):
        dist[i] = knn[i].T[knn[i].indices[-1]].data[0]
    return dist
In [49]:
X_.shape
Out[49]:
In [87]:
d = k_dist(X_,X_.shape[1])
In [88]:
pl.hist(d,bins=50);
In [89]:
knn[10].indices[-1]
Out[89]:
In [86]:
dbs = DBSCAN(min_samples=50,eps=4.3)
In [87]:
dbs.fit(X_)
Out[87]:
In [88]:
pl.hist(dbs.labels_,bins=len(set(dbs.labels_)));
In [89]:
pl.scatter(X_[:,0],X_[:,1],c=dbs.labels_,linewidths=0,alpha=0.5)
pl.xlabel("PC1")
pl.ylabel("PC2")
pl.title('PCA embedding of 10k-point subsample')
pl.colorbar()
Out[89]:
In [109]:
d = k_dist(sample,sample.shape[1])
pl.hist(d,bins=50);
In [119]:
dbs = DBSCAN(min_samples=sample.shape[1],eps=9.0)
dbs.fit(sample)
Out[119]:
In [120]:
pl.hist(dbs.labels_,bins=len(set(dbs.labels_)));
In [31]:
from sklearn.manifold import Isomap
iso = Isomap(10)
iso.fit(sample)
X_ = iso.embedding_
print(one_nn_baseline(sample,X_))
print(knn_baseline(sample,X_,5))
In [53]:
from sklearn.manifold import Isomap
iso = Isomap()
iso.fit(sample)
Out[53]:
In [54]:
iso.reconstruction_error()
Out[54]:
In [78]:
X_ = iso.embedding_
In [67]:
pl.scatter(X_[:,0],X_[:,1],c=day[sample_ind],linewidths=0,alpha=0.5)
pl.title('Isomap embedding of 10k-point subsample')
pl.colorbar()
Out[67]:
In [68]:
one_nn_baseline(sample,X_)
Out[68]:
In [79]:
knn_baseline(sample,X_,5)
Out[79]:
In [126]:
sample.shape
Out[126]:
In [142]:
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components=2,kernel='poly',degree=2)
X_ = kpca.fit_transform(sample)
In [143]:
pl.scatter(X_[:,0],X_[:,1],c=day[sample_ind],linewidths=0,alpha=0.5)
pl.title('KPCA embedding of 10k-point subsample')
pl.colorbar()
Out[143]:
In [144]:
from sklearn.svm import SVR
In [148]:
svr = SVR('poly')
In [ ]:
svr.fit(sample,day[sample_ind])
In [147]:
svr.score(sample,day[sample_ind])
Out[147]:
In [ ]:
from sklearn import kernel_approximation
In [ ]:
from sklearn.manifold import SpectralEmbedding
se = SpectralEmbedding()
X_ = se.fit_transform(sample)
In [ ]:
pl.scatter(X_[:,0],X_[:,1],c=day[sample_ind],linewidths=0,alpha=0.5)
pl.title('Spectral embedding of 10k-point subsample')
pl.colorbar()
In [ ]:
In [ ]:
one_nn_baseline(sample,X_)
In [ ]:
knn_baseline(sample,X_,10)
In [171]:
from sklearn.cross_decomposition import PLSRegression
In [172]:
pls = PLSRegression(2)
In [173]:
len(day),len(data)
Out[173]:
In [174]:
np.random.seed(0)
ind = np.arange(len(day))
ind_nn = np.arange(len(day_nn))
np.random.shuffle(ind)
np.random.shuffle(ind_nn)
split = 0.9
train,test = ind[:int(len(ind)*split)],ind[int(len(ind)*split):]
train_nn,test_nn = ind_nn[:int(len(ind_nn)*split)],ind_nn[int(len(ind_nn)*split):]
In [175]:
len(train),len(test),len(train_nn),len(test_nn)
Out[175]:
In [176]:
train
Out[176]:
In [177]:
data.shape,data[train].shape,data[test].shape
Out[177]:
In [178]:
pls.fit(data[train],day[train])
Out[178]:
In [179]:
pls.score(data[train],day[train]),pls.score(data[test],day[test])
Out[179]:
In [180]:
pls.fit(data_nn[train_nn],day_nn[train_nn])
pls.score(data_nn[train_nn],day_nn[train_nn]),pls.score(data_nn[test_nn],day_nn[test_nn])
Out[180]:
In [181]:
train_scores = []
test_scores = []
models = []
ns = list(range(1,10)) + [15,20,25,30,35,40]
for i in ns:
    pls = PLSRegression(i)
    pls.fit(data[train],day[train])
    train_scores.append(pls.score(data[train],day[train]))
    test_scores.append(pls.score(data[test],day[test]))
    models.append(pls)
In [182]:
pl.plot(ns,train_scores,label='Train (90% of data)')
pl.plot(ns,test_scores,label='Test (10% of data)')
pl.xlabel('Number of Components')
pl.ylabel('Score')
pl.legend(loc='best')
pl.title('PLS Regression Scores (Nanog-GFP)')
Out[182]:
In [82]:
train_scores = []
test_scores = []
models = []
ns = list(range(1,10)) + [15,20,25,30,35,40]
for i in ns:
    pls = PLSRegression(i)
    pls.fit(data_nn[train_nn],day_nn[train_nn])
    train_scores.append(pls.score(data_nn[train_nn],day_nn[train_nn]))
    test_scores.append(pls.score(data_nn[test_nn],day_nn[test_nn]))
    models.append(pls)
In [83]:
pl.plot(ns,train_scores,label='Train (90% of data)')
pl.plot(ns,test_scores,label='Test (10% of data)')
pl.xlabel('Number of Components')
pl.ylabel('Score')
pl.legend(loc='best')
pl.title('PLS Regression Scores (Nanog-Neo)')
Out[83]:
In [84]:
# the effect of overfitting is very slight
pl.plot(ns,np.array(train_scores) - np.array(test_scores))
Out[84]:
In [85]:
for i in range(5):
    print('Dim: {0}, Score: {1:.3f}'.format(ns[i],test_scores[i]))
In [ ]:
In [86]:
m = models[1]
X_ = m.transform(data)
X_ = m.transform(data_nn)
In [88]:
pl.scatter(X_[:,0],X_[:,1],c=day_nn,cmap='winter',linewidths=0,alpha=0.5)
pl.xlabel("PLS Component 1")
pl.ylabel("PLS Component 2")
pl.colorbar()
pl.title("Partial least squares 'progression analysis'")
Out[88]:
In [89]:
xlim = np.min(X_[:,0]),np.max(X_[:,0])
ylim = np.min(X_[:,1]),np.max(X_[:,1])
xlim,ylim
Out[89]:
In [130]:
def PLS_progression_analysis(data,times,
                             sample_name='',
                             dims=list(range(1,10))+list(range(10,20,2)),
                             train_test_split=0.9):
    # train-test split
    np.random.seed(0)
    ind = np.arange(len(times))
    np.random.shuffle(ind)
    split_index = int(len(ind)*train_test_split)
    train,test = ind[:split_index],ind[split_index:]
    # fit models
    train_scores = []
    test_scores = []
    models = []
    for i in dims:
        pls = PLSRegression(i)
        pls.fit(data[train],times[train])
        train_scores.append(pls.score(data[train],times[train]))
        test_scores.append(pls.score(data[test],times[test]))
        models.append(pls)
    # plot train/test scores against the number of components
    train_p = 100*train_test_split
    test_p = 100-train_p
    pl.plot(dims,train_scores,label='Train ({0:.0f}% of data)'.format(train_p))
    pl.plot(dims,test_scores,label='Test ({0:.0f}% of data)'.format(test_p))
    pl.xlabel('Number of Components')
    pl.ylabel('Score')
    pl.legend(loc='best')
    title = 'PLS Regression Scores'
    if len(sample_name) > 0:
        title = ': '.join((title,sample_name))
    pl.title(title)
    return train_scores,test_scores,models
In [131]:
results = PLS_progression_analysis(sample,day[sample_ind],train_test_split=0.8)
In [132]:
results = PLS_progression_analysis(Oct4_processed,day_Oct4,'Oct4',train_test_split=0.8)
In [ ]:
In [ ]:
from sklearn.neighbors import KernelDensity
for t in times:
    pl.figure()
    kde = KernelDensity(0.25)
    kde.fit(X_[day==t])
    c = np.exp(kde.score_samples(X_[day==t]))
    pl.scatter(X_[day==t,0],X_[day==t,1],c=c,cmap='winter',linewidths=0,alpha=0.5)
    pl.xlim(*xlim)
    pl.ylim(*ylim)
    pl.xlabel("PLS Component 1")
    pl.ylabel("PLS Component 2")
    pl.title("PLS 'progression analysis:' Nanog-GFP, Day {0}".format(t))
    pl.savefig('../figures/progression-animation/ng-day {0}.jpg'.format(t))
In [96]:
for t in times:
    pl.figure()
    kde = KernelDensity(0.25)
    kde.fit(X_[day_nn==t])
    c = np.exp(kde.score_samples(X_[day_nn==t]))
    pl.scatter(X_[day_nn==t,0],X_[day_nn==t,1],c=c,cmap='winter',linewidths=0,alpha=0.5)
    pl.xlim(*xlim)
    pl.ylim(*ylim)
    pl.xlabel("PLS Component 1")
    pl.ylabel("PLS Component 2")
    pl.title("PLS 'progression analysis:' Nanog-Neo, Day {0}".format(t))
    pl.savefig('../figures/progression-animation/nn-day {0}.jpg'.format(t))
In [186]:
m = models[0]
X_ = m.transform(data)
In [202]:
# let's plot progression on just the leading 'progression axis' identified by PLS
In [204]:
day1 = X_[day==day[0]]
day1.shape
Out[204]:
In [184]:
from sklearn.neighbors import KernelDensity
kde = KernelDensity(2)
In [224]:
kde.fit(day1)
Out[224]:
In [225]:
min(day1),max(day1)
Out[225]:
In [226]:
x = np.arange(min(day1),max(day1),0.1)
y = kde.score_samples(np.reshape(x,(len(x),1)))
In [227]:
pl.plot(x,y)
Out[227]:
In [231]:
from sklearn.grid_search import GridSearchCV
# this snippet from documentation:
# http://scikit-learn.org/stable/auto_examples/neighbors/plot_digits_kde_sampling.html
params = {'bandwidth': np.logspace(-1, 1, 10)}
grid = GridSearchCV(KernelDensity(), params)
grid.fit(day1)
print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
# use the best estimator to compute the kernel density estimate
kde = grid.best_estimator_
# estimate density
kde.fit(day1)
x = np.arange(min(day1),max(day1),0.1)
y = kde.score_samples(np.reshape(x,(len(x),1)))
In [232]:
pl.plot(x,y)
Out[232]:
In [198]:
x = np.arange(min(X_),max(X_),0.02)
In [238]:
progression = []
for d in sorted(set(day)):
    today = X_[day==d]
    # this snippet from documentation:
    # http://scikit-learn.org/stable/auto_examples/neighbors/plot_digits_kde_sampling.html
    params = {'bandwidth': np.logspace(-1, 1, 10)}
    grid = GridSearchCV(KernelDensity(), params)
    grid.fit(today)
    print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
    # use the best estimator to compute the kernel density estimate
    kde = grid.best_estimator_
    # estimate density
    kde.fit(today)
    y = kde.score_samples(np.reshape(x,(len(x),1)))
    progression.append(y)
In [199]:
progression = []
for d in sorted(set(day)):
    today = X_[day==d]
    kde = KernelDensity(0.2)
    kde.fit(today)
    y = kde.score_samples(np.reshape(x,(len(x),1)))
    progression.append(y)
In [241]:
prog = np.exp(np.array(progression)).T
prog.shape
Out[241]:
In [242]:
aspect = float(prog.shape[1]) / prog.shape[0]
aspect
Out[242]:
In [243]:
pl.imshow(-prog,aspect=0.5*aspect,
interpolation='none',cmap='gray')
pl.xlabel('Timestep')
pl.ylabel('Progression')
Out[243]:
In [219]:
X_.shape
Out[219]:
In [229]:
pl.scatter(day+np.random.randn(len(day))*0.1,X_[:,0],linewidths=0,alpha=0.005)
Out[229]:
In [268]:
# color snippet modified from:
# http://stackoverflow.com/questions/8931268/using-colormaps-to-set-color-of-line-in-matplotlib
import matplotlib.colors as colors
import matplotlib.cm as cmx
jet = cm = pl.get_cmap('winter')
cNorm = colors.Normalize(vmin=0, vmax=len(times))
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)
for i,y in enumerate(progression):
    colorVal = scalarMap.to_rgba(i)
    pl.plot(x,np.exp(y),label=times[i],color=colorVal)
#pl.legend(loc='best')
pl.xlabel("PLS Component 1")
pl.ylabel("Probability density")
pl.title("1D Partial least squares 'progression analysis'")
Out[268]:
In [ ]:
# another way to do this, use a color-intensity channel to encode probability density,
# and then have time on the x axis and progression on the y axis
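# A minimal sketch of that idea, assuming the `progression` list of log-densities from the cells
# above: stack the densities into an image, put timestep on the x axis, the PLS-1 grid index on
# the y axis, and let gray intensity carry the probability density. `prog_img` is a throwaway name.
prog_img = np.exp(np.array(progression)).T  # rows: position along PLS component 1, columns: timesteps
pl.imshow(prog_img, aspect='auto', origin='lower', cmap='gray_r')
pl.xlabel('Timestep')
pl.ylabel('Progression (PLS component 1, grid index)')
pl.colorbar(label='Probability density')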
In [317]:
# color snippet modified from:
# http://stackoverflow.com/questions/8931268/using-colormaps-to-set-color-of-line-in-matplotlib
import matplotlib.colors as colors
import matplotlib.cm as cmx
jet = cm = pl.get_cmap('winter')
cNorm = colors.Normalize(vmin=0, vmax=5)
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)
for i,y in enumerate(progression):
    pl.figure()
    colorVal = scalarMap.to_rgba(0)
    pl.plot(x,np.exp(y),color=colorVal,linewidth=3)
    pl.ylim(0,0.45)
    for j in range(5):
        if i-j>=0:
            colorVal = scalarMap.to_rgba(j+1)
            pl.plot(x,np.exp(progression[i-j]),color=colorVal,alpha=1.0/(j+1),linewidth=min(1,3-j))
    #pl.legend(loc='best')
    pl.xlabel("PLS Component 1")
    pl.ylabel("Probability density")
    pl.title("1D PLS 'progression analysis' Day {0}".format(times[i]))
    pl.savefig('../figures/1d-progression-animation/ng-day {0}.jpg'.format(times[i]))
In [235]:
from scipy.spatial import distance
In [236]:
prog = np.array(progression)
prog.shape
Out[236]:
In [237]:
distmat = distance.squareform(distance.pdist(prog,'canberra'))
distmat.shape
Out[237]:
In [244]:
pl.imshow(distmat,interpolation='none',cmap='Blues')
pl.xlabel('Timestep')
pl.ylabel('Timestep')
Out[244]:
In [134]:
pl.scatter(models[-1].predict(data),day,alpha=0.1,linewidths=0)
pl.plot(day,day)
Another idea: cluster the full dataset, and describe each timepoint by the relative occupancy of each cluster
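A minimal sketch of that featurization, normalizing each timepoint's cluster counts so that rows are relative occupancies (distributions over clusters); relative_occupancy is a made-up helper name, and the cells below build the unnormalized version with num_in_clust.

def relative_occupancy(labels, day_labels, timepoints, num_clusters):
    # one row per timepoint: fraction of that day's cells falling in each cluster
    counts = np.array([np.bincount(labels[day_labels == t], minlength=num_clusters)
                       for t in timepoints], dtype=float)
    return counts / counts.sum(axis=1, keepdims=True)

With the variables defined below, relative_occupancy(y, day, times, num_clust) would give a (num_timepoints, num_clust) matrix whose rows sum to 1.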
In [12]:
from sklearn.cluster import MiniBatchKMeans
In [13]:
num_clust=20
In [14]:
km = MiniBatchKMeans(num_clust)
In [359]:
km.fit(data)
Out[359]:
In [360]:
y = km.predict(data)
In [361]:
def num_in_clust(Y,num_clusters=50):
    assert(num_clusters > max(Y))
    occupancy = np.zeros(num_clusters)
    for y in Y:
        occupancy[y] += 1
    return occupancy
In [362]:
pl.bar(range(num_clust),num_in_clust(y,num_clust))
pl.title('Overall')
pl.figure()
pl.bar(range(num_clust),num_in_clust(y[day==times[0]],num_clust))
pl.title('Initial')
pl.figure()
pl.bar(range(num_clust),num_in_clust(y[day==times[-1]],num_clust))
pl.title('Final')
Out[362]:
In [363]:
featurized = np.array([num_in_clust(y[day==t]) for t in times])
featurized.shape
Out[363]:
In [364]:
distmat = distance.squareform(distance.pdist(featurized,'canberra'))
pl.imshow(distmat,interpolation='none',cmap='Blues')
Out[364]:
In [365]:
#clust_ind = sorted(range(50),key=lambda i:featurized[:,i].dot(times))
clust_ind = sorted(range(num_clust),key=lambda i:(y==i).dot(np.arange(len(y))))
In [366]:
pl.bar(range(num_clust),num_in_clust(y,num_clust))
pl.title('Overall')
pl.figure()
pl.bar(range(num_clust),num_in_clust(y[day==times[0]],num_clust))
pl.title('Initial')
pl.figure()
pl.bar(range(num_clust),num_in_clust(y[day==times[-1]],num_clust))
pl.title('Final')
Out[366]:
In [367]:
clust_mat = np.array([num_in_clust(y[day==t],num_clust)[clust_ind] for t in times]).T
In [368]:
for i in range(len(times)):
    pl.figure()
    pl.bar(range(num_clust),num_in_clust(y[day==times[i]],num_clust)[clust_ind])
    pl.title('Cluster occupancies: Day {0}'.format(times[i]))
    pl.xlabel('Cluster ID')
    pl.ylabel('Occupancy')
    pl.ylim(0,np.max(clust_mat))
    pl.savefig('../figures/occupancy-animation/ng-day {0}.jpg'.format(times[i]))
In [369]:
aspect = float(clust_mat.shape[1]) / clust_mat.shape[0]
pl.imshow(clust_mat,aspect=aspect,
interpolation='none',cmap='Blues')
pl.xlabel('Timestep')
pl.ylabel('Cluster ID')
pl.title('Cluster occupancy per time step')
pl.colorbar()
Out[369]:
In [370]:
pl.imshow(clust_mat,interpolation='none')
Out[370]:
In [356]:
y[clust_ind]
Out[356]:
In [376]:
y.shape
Out[376]:
In [380]:
X_ = pca.fit_transform(data)
pl.scatter(X_[:,0],X_[:,1],c=[clust_ind[i] for i in y],cmap='Blues',linewidths=0,alpha=0.5)
pl.colorbar()
Out[380]:
In [33]:
len(all_data)
Out[33]:
In [21]:
all_data = np.vstack((data,data_nn))
all_data.shape
Out[21]:
In [22]:
pca = PCA()
pca.fit(data)
pl.plot(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_[:2]))
In [23]:
pca = PCA()
pca.fit(data_nn)
pl.plot(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_[:2]))
In [386]:
X_ = models[1].transform(data)
X_.shape
Out[386]:
In [387]:
import seaborn
In [399]:
xlim = np.min(X_[:,0]),np.max(X_[:,0])
ylim = np.min(X_[:,1]),np.max(X_[:,1])
In [400]:
for i,d in enumerate(sorted(set(day))):
    seaborn.kdeplot(X_[day==d]);
    pl.xlim(*xlim)
    pl.ylim(*ylim)
    pl.savefig('../figures/2d-contour-progression-animation/{0}.jpg'.format(i))
    pl.close()
In [ ]: